In this practical, we are going to learn about feature selection methods for text data.
We will use the following libraries, mainly from sklearn. Take care to have them installed!
from sklearn.datasets import load_files
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
Let's get started!
1. Here we are going to use a news article data set originating from the BBC news website, provided for benchmarking machine learning algorithms. The BBC data set consists of 2,225 documents in 5 categories: business, entertainment, politics, sport, and tech. Download the data.zip file and extract it into your data folder. Then use the code below to convert the resulting object to a dataframe.
# Load the dataset, handling encoding errors gracefully
# (adjust the path below to match where you extracted data.zip)
data = load_files('data/bbcsport-fulltext/bbcsport', encoding='utf-8', decode_error='replace')
# Convert the data into a pandas DataFrame
df = pd.DataFrame(list(zip(data['data'], data['target'])), columns=['text', 'label'])
# Display the first few rows
print(df.head())
                                                text  label
0  England victory tainted by history\n\nAs Engla...      1
1  Australia complete sweep\n\nThird Test, Sydney...      1
2  UK Athletics agrees new kit deal\n\nUK Athleti...      0
3  Bekele sets sights on world mark\n\nOlympic 10...      0
4  Captains lining up for Aid match\n\nIreland's ...      3
2. Print the unique target names in your data and check the number of articles in each category. Then split your data into training (80%) and test (20%) sets.
labels, counts = np.unique(df['label'], return_counts=True)
print(dict(zip(data.target_names, counts)))
{'business': 510, 'entertainment': 386, 'politics': 417, 'sport': 511, 'tech': 401}
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)  # fix the seed so the split is reproducible
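As a quick sanity check: with 2,225 articles, an 80/20 split should leave 1,780 documents for training and 445 for testing.
print(len(X_train), len(X_test))  # expected: 1780 445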
3. Use the CountVectorizer from sklearn to convert the text data into a document-term matrix. What is the difference between CountVectorizer and TfidfVectorizer(use_idf=False)?
# Tokenizer to remove unwanted elements (e.g. symbols) from our data
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# Initialize the "CountVectorizer" object, which is scikit-learn's bag-of-words tool.
# If you run into memory issues, reduce max_features so you can continue with the practical.
vectorizer = CountVectorizer(lowercase=True,
tokenizer=token.tokenize,
stop_words='english',
ngram_range=(1, 2),
analyzer='word',
min_df=3,
max_features=None)
# fit_transform() does two things: first, it fits the model and learns the vocabulary;
# second, it transforms our data into feature vectors.
# The input to fit_transform should be a list (or Series) of strings.
bbc_dtm = vectorizer.fit_transform(X_train)
print(bbc_dtm.shape)
/usr/local/lib/python3.10/dist-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None
(1780, 25223)
The most visible difference is that TfidfVectorizer() returns floats while CountVectorizer() returns ints. But that is not the whole story: by default TfidfVectorizer uses norm='l2', so even with use_idf=False each document's count vector is rescaled to unit length. Only with use_idf=False and norm=None does TfidfVectorizer reproduce the raw counts exactly, just as floats.
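A minimal sketch on a toy corpus (not the BBC data) that illustrates this: with use_idf=False and norm=None, TfidfVectorizer reproduces the raw counts, differing only in dtype.
docs = ["the cat sat", "the cat sat on the mat"]
counts = CountVectorizer().fit_transform(docs).toarray()           # integer counts
tf = TfidfVectorizer(use_idf=False, norm=None).fit_transform(docs).toarray()  # same values as floats
print(np.allclose(counts, tf))  # True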
4. Print the top 20 most frequent words in the training set.
# Sum the counts over all documents and sort features by total frequency (descending)
importance = np.argsort(np.asarray(bbc_dtm.sum(axis=0)).ravel())[::-1]
feature_names = np.array(vectorizer.get_feature_names_out())
feature_names[importance[:20]]
array(['s', 'said', 'mr', 'year', 'people', 'new', 't', 'time', 'world', 'government', 'uk', 'years', 'best', 'just', 'told', 'film', 'make', '1', 'game', 'like'], dtype=object)
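Tokens like 's', 't', and '1' appear because the tokenizer splits contractions such as "it's" at the apostrophe. If you want to suppress these fragments (an optional tweak, not part of the assignment), require at least two characters per token:
token_strict = RegexpTokenizer(r'[a-zA-Z0-9]{2,}')  # drops single-character fragments like 's'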
Filter-based feature selection
5. From the feature selection library in sklearn, load the SelectKBest function and apply it to the BBC dataset using the chi-squared method. Extract the top 20 features.
# Transform the test set with the vocabulary learned on the training set
X_test_vectorized = vectorizer.transform(X_test)
ch2 = SelectKBest(chi2, k=20)
ch2.fit_transform(bbc_dtm, y_train)
<1780x20 sparse matrix of type '<class 'numpy.int64'>' with 4428 stored elements in Compressed Sparse Row format>
feature_names_chi = [feature_names[i] for i in ch2.get_support(indices=True)]
feature_names_chi
['best', 'blair', 'brown', 'computer', 'digital', 'election', 'film', 'government', 'labour', 'minister', 'mobile', 'mr', 'mr blair', 'music', 'net', 'party', 'people', 'software', 'technology', 'users']
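To see how strongly each selected term scored, the fitted selector exposes the chi-squared statistics through its scores_ attribute:
# Print each selected feature together with its chi-squared score
for i in ch2.get_support(indices=True):
    print(f"{feature_names[i]:<15} {ch2.scores_[i]:.1f}")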
6. Repeat the analysis in Question 5 with the mutual information feature selection method. Do you get the same list of words as compared to the chi-squared method?
mutual_info = SelectKBest(mutual_info_classif, k=20)
mutual_info.fit_transform(bbc_dtm, y_train)
<1780x20 sparse matrix of type '<class 'numpy.int64'>' with 6350 stored elements in Compressed Sparse Row format>
feature_names_mutual_info = [feature_names[i] for i in mutual_info.get_support(indices=True)]
feature_names_mutual_info
['blair', 'coach', 'election', 'film', 'firm', 'game', 'government', 'labour', 'market', 'minister', 'mr', 'music', 'party', 'people', 'said', 'secretary', 'technology', 'tory', 'users', 'win']
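The two selections overlap considerably (terms like 'blair', 'labour', and 'technology' appear in both) but are not identical; you can inspect the intersection directly:
print(sorted(set(feature_names_chi) & set(feature_names_mutual_info)))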
Now you can build a classifier and train it using the output of these feature selection techniques. We are not going to do this right now, but if you are interested, you can transform your training and test sets using the selected features and continue with your classifier! Here are some tips:
# X_train = mutual_info.fit_transform(bbc_dtm, y_train)
# X_test = mutual_info.transform(X_test_vectorized)
Embedded feature selection
7. One of the functions for embedded feature selection is the SelectFromModel function in sklearn. Use this function with an L1-norm SVM and check how many non-zero coefficients are left in the model.
print("shape of the matrix before applying the embedded feature selection:", bbc_dtm.shape)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
model = SelectFromModel(lsvc).fit(bbc_dtm, y_train) # you can add threshold=0.18 as another argument to select features that have an importance of more than 0.18
X_new = model.transform(bbc_dtm)
print("shape of the matrix after applying the embedded feature selection:", X_new.shape)
shape of the matrix before applying the embedded feature selection: (1780, 25223)
shape of the matrix after applying the embedded feature selection: (1780, 154)
model
SelectFromModel(estimator=LinearSVC(C=0.01, dual=False, penalty='l1'))
# you can also check the coefficient values
model.estimator_.coef_
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
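The 154 columns reported above should correspond (up to SelectFromModel's default threshold) to the features with at least one non-zero coefficient across the five one-vs-rest classifiers; a small check, assuming the fitted model from above:
coef = model.estimator_.coef_  # shape: (n_classes, n_features)
print("non-zero coefficients per class:", (coef != 0).sum(axis=1))
print("features with a non-zero coefficient in any class:", int((coef != 0).any(axis=0).sum()))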
8. What are the top features according to the SVM model? Tip: use the function model.get_support() to find these features.
model.get_support()
array([False, False, False, ..., False, False, False])
print("Features selected by SelectFromModel: ", feature_names[model.get_support()])
Features selected by SelectFromModel: ['000' '1' '2' '2004' '6' 'airlines' 'album' 'analysts' 'apple' 'athens' 'athletics' 'award' 'ballet' 'ban' 'band' 'bank' 'bbc' 'best' 'bid' 'blair' 'blog' 'book' 'britain' 'broadband' 'brown' 'business' 'champion' 'chart' 'chelsea' 'chief' 'children' 'china' 'club' 'coach' 'comedy' 'companies' 'company' 'computer' 'conte' 'content' 'council' 'cup' 'data' 'deal' 'digital' 'dollar' 'doping' 'drugs' 'e' 'economic' 'economy' 'education' 'election' 'england' 'eu' 'european' 'euros' 'film' 'financial' 'firm' 'firms' 'fraud' 'game' 'games' 'gaming' 'glazer' 'good' 'government' 'group' 'growth' 'high' 'home' 'howard' 'iaaf' 'information' 'injury' 'just' 'labour' 'league' 'like' 'liverpool' 'lord' 'm' 'make' 'market' 'match' 'microsoft' 'million' 'minister' 'mobile' 'mps' 'mr' 'music' 'musical' 'net' 'new' 'nintendo' 'number' 'o' 'oil' 'old' 'olympic' 'online' 'party' 'people' 'plans' 'play' 'players' 'police' 'president' 'prices' 'public' 'rights' 'rugby' 's' 'said' 'sales' 'says' 'season' 'secretary' 'series' 'service' 'services' 'set' 'shares' 'singer' 'site' 'software' 'sony' 'spam' 'star' 'stars' 'state' 't' 'team' 'technology' 'time' 'trade' 'tv' 'uk' 'united' 'use' 'used' 'users' 'using' 'video' 'virus' 'web' 'website' 'win' 'won' 'world' 'year' 'year old']
Model comparison
9. Create a pipeline with the tfidf representation and a random forest classifier.
clf1 = Pipeline([
('vectorizer', CountVectorizer()),
('feature_extraction', TfidfTransformer()),
('classification', RandomForestClassifier())
])
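Side note: the first two steps can be collapsed into a single TfidfVectorizer, which combines counting and tf-idf weighting; an equivalent sketch:
clf1_alt = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classification', RandomForestClassifier())
])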
10. Fit the pipeline on the training set.
clf1.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()), ('feature_extraction', TfidfTransformer()), ('classification', RandomForestClassifier())])
11. Use the pipeline to predict the outcome variable on your test set. Evaluate the performance of the pipeline using the classification_report function on the test subset. How do you interpret your results?
y_pred1 = clf1.predict(X_test)
print(metrics.classification_report(y_test, y_pred1, target_names=data.target_names))
               precision    recall  f1-score   support

     business       0.95      0.96      0.95        92
entertainment       0.98      0.94      0.96        84
     politics       0.93      0.92      0.93        77
        sport       0.97      0.99      0.98       111
         tech       0.96      0.98      0.97        81

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weighted avg       0.96      0.96      0.96       445
12. Create your second pipeline with the tfidf representation and a random forest classifier, this time adding embedded feature selection with an L1-penalized SVM. Fit the pipeline on your training set and test it with the test set. How does the performance change?
clf2 = Pipeline([
('vectorizer', CountVectorizer()),
('feature_extraction', TfidfTransformer()),
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
('classification', RandomForestClassifier())
])
clf2.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()), ('feature_extraction', TfidfTransformer()), ('feature_selection', SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1'))), ('classification', RandomForestClassifier())])
y_pred2 = clf2.predict(X_test)
print(metrics.classification_report(y_test, y_pred2, target_names=data.target_names))
               precision    recall  f1-score   support

     business       0.91      0.93      0.92        92
entertainment       0.96      0.93      0.95        84
     politics       0.91      0.91      0.91        77
        sport       1.00      0.99      1.00       111
         tech       0.94      0.96      0.95        81

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445
13. Create your third and fourth pipelines with the tfidf representation, a chi2 feature selection (with 20 and 200 features for clf3 and clf4, respectively), and a random forest classifier.
clf3 = Pipeline([
('vectorizer', CountVectorizer()),
('feature_extraction', TfidfTransformer()),
('feature_selection', SelectKBest(chi2, k=20)),
('classification', RandomForestClassifier())
])
clf3.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()), ('feature_extraction', TfidfTransformer()), ('feature_selection', SelectKBest(k=20, score_func=<function chi2 at 0x7fd8444d23b0>)), ('classification', RandomForestClassifier())])
y_pred3 = clf3.predict(X_test)
print(metrics.classification_report(y_test, y_pred3, target_names=data.target_names))
               precision    recall  f1-score   support

     business       0.65      0.46      0.54        92
entertainment       0.80      0.57      0.67        84
     politics       0.79      0.73      0.76        77
        sport       0.63      0.98      0.76       111
         tech       0.88      0.81      0.85        81

     accuracy                           0.72       445
    macro avg       0.75      0.71      0.71       445
 weighted avg       0.74      0.72      0.71       445
clf4 = Pipeline([
('vectorizer', CountVectorizer()),
('feature_extraction', TfidfTransformer()),
('feature_selection', SelectKBest(chi2, k=200)),
('classification', RandomForestClassifier())
])
clf4.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()), ('feature_extraction', TfidfTransformer()), ('feature_selection', SelectKBest(k=200, score_func=<function chi2 at 0x7fd8444d23b0>)), ('classification', RandomForestClassifier())])
y_pred4 = clf4.predict(X_test)
print(metrics.classification_report(y_test, y_pred4, target_names=data.target_names))
               precision    recall  f1-score   support

     business       0.86      0.92      0.89        92
entertainment       0.97      0.90      0.94        84
     politics       0.92      0.88      0.90        77
        sport       0.99      0.98      0.99       111
         tech       0.93      0.96      0.95        81

     accuracy                           0.93       445
    macro avg       0.93      0.93      0.93       445
 weighted avg       0.94      0.93      0.94       445
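With only 20 features the classifier loses a lot of information (0.72 accuracy), while 200 features already recover most of the performance (0.93). If you want to explore this trade-off further, a small sweep over k (a sketch; it refits the pipeline once per value) could look like:
for k in [20, 200, 2000]:
    clf = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('feature_extraction', TfidfTransformer()),
        ('feature_selection', SelectKBest(chi2, k=k)),
        ('classification', RandomForestClassifier())
    ])
    clf.fit(X_train, y_train)
    print(k, metrics.accuracy_score(y_test, clf.predict(X_test)))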
14. We can change the learner by simply plugging a different classifier object into our pipeline. Create your fifth pipeline with an L1-norm SVM as the feature selection method and naive Bayes as the classifier. Compare your results on the test set with the previous pipelines.
clf5 = Pipeline([
('vectorizer', CountVectorizer()),
('feature_extraction', TfidfTransformer()),
('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
('classification', MultinomialNB(alpha=0.01))
])
clf5.fit(X_train, y_train)
Pipeline(steps=[('vectorizer', CountVectorizer()), ('feature_extraction', TfidfTransformer()), ('feature_selection', SelectFromModel(estimator=LinearSVC(dual=False, penalty='l1'))), ('classification', MultinomialNB(alpha=0.01))])
y_pred5 = clf5.predict(X_test)
print(metrics.classification_report(y_test, y_pred5, target_names=data.target_names))
               precision    recall  f1-score   support

     business       0.96      0.93      0.95        92
entertainment       1.00      0.94      0.97        84
     politics       0.95      0.99      0.97        77
        sport       1.00      1.00      1.00       111
         tech       0.93      0.98      0.95        81

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445
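To wrap up the comparison, a short snippet that prints the test accuracy of all five pipelines side by side (it assumes the y_pred variables computed above):
for name, y_pred in [('clf1', y_pred1), ('clf2', y_pred2), ('clf3', y_pred3),
                     ('clf4', y_pred4), ('clf5', y_pred5)]:
    print(f"{name}: {metrics.accuracy_score(y_test, y_pred):.3f}")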